Import libraries

library(corrplot)
library(cluster)
library(factoextra)
library(reshape2)
library(tidyverse)
library(RColorBrewer)
library(scales)
library(fpc)
library(EnvStats)
library(NbClust)
library(dendextend)
library(gtools)
library(ClusterR)
library(dendextend)

Introduction

The dataset is related to variety of wine variants. The goal is to model wine quality based on physicochemical tests.

Objective

These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines. Also, we are not sure if all input variables are relevant. So it could be interesting to test feature selection methods.

Data Analysis

The data set contains 18 variables that involve wine realted attributes.

Data Dictionary

Column Name Description
1 - fixed acidity fixed acidity of the wine
2 - volatile acidity volatile acidity of the wine
3 - citric acid citric acid in wine observation
4 - residual sugar residual sugar in wine
5 - chlorides cholorides in wine
6 - free sulfur dioxide free sulfur dioxide
7 - total sulfur dioxide total sulfur dioxide
8 - density Wine density
9 - pH Wine pH value
10 - sulphates sulphates in wine
11 - alcohol Wine alchocoal level
12 - #add rowid (0 to 10) wien quality based on based on sensory data)
13 - country country origin of wine
14 - province province origin of wine
15 - region_1 Region of wine origin
16 - region_2 sub-region of wine origin
17 - variety Type of wine
18 - winery Name of the winery

Load dataset

Read data

#Read data
raw <- read.csv('./data/wine-with-type-location.csv', header=TRUE, sep = ";")

Add rowid

#add rowid
raw <- tibble::rowid_to_column(raw, "ROWID")
head(raw, 5)
##   ROWID fixed.acidity volatile.acidity citric.acid residual.sugar
## 1     1           7.4             0.70        0.00            1.9
## 2     2           7.8             0.88        0.00            2.6
## 3     3           7.8             0.76        0.04            2.3
## 4     4          11.2             0.28        0.56            1.9
## 5     5           7.4             0.70        0.00            1.9
##   chlorides free.sulfur.dioxide total.sulfur.dioxide density   pH
## 1     0.076                  11                   34  0.9978 3.51
## 2     0.098                  25                   67  0.9968 3.20
## 3     0.092                  15                   54  0.9970 3.26
## 4     0.075                  17                   60  0.9980 3.16
## 5     0.076                  11                   34  0.9978 3.51
##   sulphates alcohol quality  country          province            region_1
## 1      0.56     9.4       5    Italy Sicily & Sardinia                Etna
## 2      0.68     9.8       5 Portugal             Douro                    
## 3      0.65     9.8       5       US            Oregon   Willamette Valley
## 4      0.58     9.8       6       US          Michigan Lake Michigan Shore
## 5      0.56     9.4       5       US            Oregon   Willamette Valley
##            region_2        variety              winery
## 1                      White Blend             Nicosia
## 2                   Portuguese Red Quinta dos Avidagos
## 3 Willamette Valley     Pinot Gris           Rainstorm
## 4                         Riesling          St. Julian
## 5 Willamette Valley     Pinot Noir        Sweet Cheeks
# summary(raw)

#numeric data,  'quality' variable is dependent variable 
data <- raw[c(1:12)]
summary(data)
##      ROWID        fixed.acidity   volatile.acidity  citric.acid   
##  Min.   :   1.0   Min.   : 4.60   Min.   :0.1200   Min.   :0.000  
##  1st Qu.: 400.5   1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090  
##  Median : 800.0   Median : 7.90   Median :0.5200   Median :0.260  
##  Mean   : 800.0   Mean   : 8.32   Mean   :0.5278   Mean   :0.271  
##  3rd Qu.:1199.5   3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420  
##  Max.   :1599.0   Max.   :15.90   Max.   :1.5800   Max.   :1.000  
##  residual.sugar     chlorides       free.sulfur.dioxide
##  Min.   : 0.900   Min.   :0.01200   Min.   : 1.00      
##  1st Qu.: 1.900   1st Qu.:0.07000   1st Qu.: 7.00      
##  Median : 2.200   Median :0.07900   Median :14.00      
##  Mean   : 2.539   Mean   :0.08747   Mean   :15.87      
##  3rd Qu.: 2.600   3rd Qu.:0.09000   3rd Qu.:21.00      
##  Max.   :15.500   Max.   :0.61100   Max.   :72.00      
##  total.sulfur.dioxide    density             pH          sulphates     
##  Min.   :  6.00       Min.   :0.9901   Min.   :2.740   Min.   :0.3300  
##  1st Qu.: 22.00       1st Qu.:0.9956   1st Qu.:3.210   1st Qu.:0.5500  
##  Median : 38.00       Median :0.9968   Median :3.310   Median :0.6200  
##  Mean   : 46.47       Mean   :0.9967   Mean   :3.311   Mean   :0.6581  
##  3rd Qu.: 62.00       3rd Qu.:0.9978   3rd Qu.:3.400   3rd Qu.:0.7300  
##  Max.   :289.00       Max.   :1.0037   Max.   :4.010   Max.   :2.0000  
##     alcohol     
##  Min.   : 8.40  
##  1st Qu.: 9.50  
##  Median :10.20  
##  Mean   :10.42  
##  3rd Qu.:11.10  
##  Max.   :14.90

Structure of data

No NAs or Nans or blanks found

str(data)
## 'data.frame':    1599 obs. of  12 variables:
##  $ ROWID               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
nrow(data)
## [1] 1599

Visualizations

#Melt data
melt_data = melt(data, id.vars=c("ROWID"))

#visualize spread of data
ggplot(melt_data,  mapping = aes(x = value)) + geom_bar(fill = "#FF6666") + facet_wrap(~variable, scales = 'free_x')

boxplot(data[,-c(1)])

Above shows data needs scaling and has outliers

Checking outlier

#3 outlier
rosnerTest(data$volatile.acidity, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$volatile.acidity
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 5.876138
##                                  R.2 = 4.531485
##                                  R.3 = 4.562348
##                                  R.4 = 4.079513
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     3
## 
##   i    Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 0.5278205 0.1790597  1.58    1300 5.876138   4.153388    TRUE
## 2 1 0.5271621 0.1771688  1.33     127 4.531485   4.153240    TRUE
## 3 2 0.5266594 0.1760805  1.33     128 4.562348   4.153091    TRUE
## 4 3 0.5261560 0.1749827  1.24     673 4.079513   4.152943   FALSE
#0 outliers
rosnerTest(data$citric.acid, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$citric.acid
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 3.742403
##                                  R.2 = 2.677655
##                                  R.3 = 2.632885
##                                  R.4 = 2.535969
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     0
## 
##   i    Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 0.2709756 0.1948011  1.00     152 3.742403   4.153388   FALSE
## 2 1 0.2705194 0.1940058  0.79     354 2.677655   4.153240   FALSE
## 3 2 0.2701941 0.1936301  0.78    1575 2.632885   4.153091   FALSE
## 4 3 0.2698747 0.1932695  0.76     259 2.535969   4.152943   FALSE
#4 outliers
rosnerTest(data$residual.sugar, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$residual.sugar
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 9.192806
##                                  R.2 = 9.376226
##                                  R.3 = 9.648666
##                                  R.4 = 8.788466
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     4
## 
##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 2.538806 1.409928  15.5     481 9.192806   4.153388    TRUE
## 2 1 2.530695 1.372546  15.4    1435 9.376226   4.153240    TRUE
## 3 2 2.522636 1.334626  15.4    1436 9.648666   4.153091    TRUE
## 4 3 2.514568 1.295497  13.9    1575 8.788466   4.152943    TRUE
#4 outliers
rosnerTest(data$chlorides, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$chlorides
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 11.123555
##                                  R.2 = 11.562755
##                                  R.3 =  8.780833
##                                  R.4 =  8.932900
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     4
## 
##   i     Mean.i       SD.i Value Obs.Num     R.i+1 lambda.i+1 Outlier
## 1 0 0.08746654 0.04706530 0.611     259 11.123555   4.153388    TRUE
## 2 1 0.08713892 0.04521942 0.610     152 11.562755   4.153240    TRUE
## 3 2 0.08681152 0.04329754 0.467     107  8.780833   4.153091    TRUE
## 4 3 0.08657331 0.04225130 0.464      82  8.932900   4.152943    TRUE
#4 outliers
rosnerTest(data$free.sulfur.dioxide, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$free.sulfur.dioxide
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 5.365606
##                                  R.2 = 5.030550
##                                  R.3 = 5.072499
##                                  R.4 = 4.919615
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     4
## 
##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 15.87492 10.46016    72    1245 5.365606   4.153388    TRUE
## 2 1 15.83980 10.36869    68     397 5.030550   4.153240    TRUE
## 3 2 15.80714 10.28938    68     401 5.072499   4.153091    TRUE
## 4 3 15.77444 10.20925    66    1559 4.919615   4.152943    TRUE
#0 outliers
rosnerTest(data$density, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$density
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 3.678904
##                                  R.2 = 3.695748
##                                  R.3 = 3.561134
##                                  R.4 = 3.576495
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     0
## 
##   i    Mean.i        SD.i   Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 0.9967467 0.001887334 1.00369    1435 3.678904   4.153388   FALSE
## 2 1 0.9967423 0.001879908 1.00369    1436 3.695748   4.153240   FALSE
## 3 2 0.9967380 0.001872433 0.99007    1018 3.561134   4.153091   FALSE
## 4 3 0.9967422 0.001865559 0.99007    1019 3.576495   4.152943   FALSE
#2 outliers
rosnerTest(data$pH, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$pH
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 4.526866
##                                  R.2 = 4.557617
##                                  R.3 = 3.867629
##                                  R.4 = 3.887110
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     2
## 
##   i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 3.311113 0.1543865  4.01    1317 4.526866   4.153388    TRUE
## 2 1 3.310676 0.1534408  4.01    1322 4.557617   4.153240    TRUE
## 3 2 3.310238 0.1524867  3.90      46 3.867629   4.153091   FALSE
## 4 3 3.309868 0.1518176  3.90     696 3.887110   4.152943   FALSE
#4 outliers
rosnerTest(data$sulphates, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$sulphates
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 7.916200
##                                  R.2 = 7.958429
##                                  R.3 = 7.939605
##                                  R.4 = 8.103844
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     4
## 
##   i    Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 0.6581488 0.1695070  2.00     152 7.916200   4.153388    TRUE
## 2 1 0.6573091 0.1662000  1.98      93 7.958429   4.153240    TRUE
## 3 2 0.6564809 0.1629198  1.95      87 7.939605   4.153091    TRUE
## 4 3 0.6556704 0.1597180  1.95      92 8.103844   4.152943    TRUE
#1 outlier
rosnerTest(data$alcohol, k = 4, warn = F)
## 
## Results of Outlier Test
## -------------------------
## 
## Test Method:                     Rosner's Test for Outliers
## 
## Hypothesized Distribution:       Normal
## 
## Data:                            data$alcohol
## 
## Sample Size:                     1599
## 
## Test Statistics:                 R.1 = 4.201138
##                                  R.2 = 3.376887
##                                  R.3 = 3.390076
##                                  R.4 = 3.403421
## 
## Test Statistic Parameter:        k = 4
## 
## Alternative Hypothesis:          Up to 4 observations are not
##                                  from the same Distribution.
## 
## Type I Error:                    5%
## 
## Number of Outliers Detected:     1
## 
##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 10.42298 1.065668  14.9     653 4.201138   4.153388    TRUE
## 2 1 10.42018 1.060094  14.0     143 3.376887   4.153240   FALSE
## 3 2 10.41794 1.056631  14.0     145 3.390076   4.153091   FALSE
## 4 3 10.41570 1.053148  14.0     468 3.403421   4.152943   FALSE

Correlations

#corr of data
corrmatrix <- cor(data)
corrplot(corrmatrix, method = 'pie', type="upper")

Moderate corr of 0.6ish for following 1) fixed acidity - citric acid 2) fixed acidity - density 3) free.sulfur.dioxide - total.sulfur.dioxide

Removing fixed acidity and total.sulfur

#we will remove fixed acidity and total.sulfur
data <- data[-c(1,7)]

#above are visualized and validated via plot
plot(data)

Data preparation

We replaced outliers with 5th and 95th percentile values.

#replace outliers with 5th and 95th percentile values
#remember An outlier is not any point over the 95th percentile 
#or below the 5th percentile. Instead, an outlier is considered so 
#if it is below the first quartile – 1.5·IQR or above third quartile + 1.5·IQR.

capOutlier <- function(x){
   qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
   caps <- quantile(x, probs=c(.05, .95), na.rm = T)
   H <- 1.5 * IQR(x, na.rm = T)
   x[x < (qnt[1] - H)] <- caps[1]
   x[x > (qnt[2] + H)] <- caps[2]
   return(x)
}
data$volatile.acidity=capOutlier(data$volatile.acidity)
data$residual.sugar=capOutlier(data$residual.sugar)
data$chlorides=capOutlier(data$chlorides)
data$total.sulfur.dioxide=capOutlier(data$total.sulfur.dioxide)
data$density=capOutlier(data$density)
data$pH=capOutlier(data$pH)
data$sulphates=capOutlier(data$sulphates)
data$alcohol=capOutlier(data$alcohol)

#scale data
data_scaled <- data.frame(scale(data))

#visualize how well data was scaled
boxplot(data_scaled)

#scaling looks good
silhouette_score <- function(k){
  km <- kmeans(data, centers = k, nstart=25)
  ss <- silhouette(km$cluster, dist(data_scaled))
  mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE)

Show Optimal number of clusters:

fviz_nbclust(data_scaled, kmeans, method='silhouette')

PCA and reduction

# ##### Build pca using princomp
data_pca1 <- princomp(data_scaled)

#examine the importance of PCs
summary(data_pca1)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
## Standard deviation     1.7683466 1.3882991 1.0542077 0.99435124 0.94105185
## Proportion of Variance 0.3129006 0.1928580 0.1112049 0.09893531 0.08861328
## Cumulative Proportion  0.3129006 0.5057587 0.6169636 0.71589893 0.80451221
##                            Comp.6     Comp.7    Comp.8     Comp.9
## Standard deviation     0.83825050 0.74793169 0.6311535 0.47081888
## Proportion of Variance 0.07031036 0.05597519 0.0398604 0.02218091
## Cumulative Proportion  0.87482257 0.93079776 0.9706582 0.99283907
##                            Comp.10
## Standard deviation     0.267515433
## Proportion of Variance 0.007160929
## Cumulative Proportion  1.000000000
# inspect principal components
# loadings shows variance and and how much each variable contributes to each components
loadings(data_pca1)
## 
## Loadings:
##                      Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## fixed.acidity         0.497         0.147  0.233  0.132         0.270
## volatile.acidity     -0.231 -0.480         0.307        -0.101  0.645
## citric.acid           0.457  0.263        -0.112  0.141        -0.136
## residual.sugar        0.176 -0.141 -0.757  0.304  0.238        -0.154
## chlorides             0.234 -0.346               -0.482 -0.655 -0.343
## total.sulfur.dioxide        -0.288 -0.343 -0.766  0.280 -0.114  0.181
## density               0.410 -0.342         0.127         0.482       
## pH                   -0.422        -0.262        -0.275  0.393 -0.343
## sulphates             0.218  0.268 -0.235 -0.270 -0.710  0.199  0.378
## alcohol              -0.103  0.535 -0.390  0.249        -0.314  0.233
##                      Comp.8 Comp.9 Comp.10
## fixed.acidity         0.264  0.353  0.618 
## volatile.acidity      0.192 -0.389        
## citric.acid           0.348 -0.737        
## residual.sugar       -0.400         0.163 
## chlorides             0.133  0.114        
## total.sulfur.dioxide  0.229  0.180        
## density               0.253  0.195 -0.593 
## pH                    0.537         0.338 
## sulphates            -0.241               
## alcohol               0.363  0.286 -0.342 
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings       1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.1    0.1    0.1    0.1    0.1    0.1    0.1    0.1
## Cumulative Var    0.1    0.2    0.3    0.4    0.5    0.6    0.7    0.8
##                Comp.9 Comp.10
## SS loadings       1.0     1.0
## Proportion Var    0.1     0.1
## Cumulative Var    0.9     1.0
fviz_pca_var(data_pca1,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )

Above shows that first 8 give us 89% of the variance We don’t see one variable being overbearing

#plot
plot(data_pca1)

# scree plot
plot(data_pca1, type = "lines")

#biplot
biplot(data_pca1, col = c("gray", "black"))

# ###### using princomp first 10 give us 90% of the variance

# ##### build pca using prcomp
data_pca2 <- prcomp(data_scaled)
summary(data_pca2)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7689 1.3887 1.0545 0.99466 0.94135 0.83851
## Proportion of Variance 0.3129 0.1929 0.1112 0.09894 0.08861 0.07031
## Cumulative Proportion  0.3129 0.5058 0.6170 0.71590 0.80451 0.87482
##                            PC7     PC8     PC9    PC10
## Standard deviation     0.74817 0.63135 0.47097 0.26760
## Proportion of Variance 0.05598 0.03986 0.02218 0.00716
## Cumulative Proportion  0.93080 0.97066 0.99284 1.00000
#above shows first 6 account for 88% variance

#plot
plot(data_pca2)

# scree plot
plot(data_pca2, type = "lines")

#biplot
biplot(data_pca2, col = c("gray", "black"))

fviz_pca_var(data_pca2,
             col.var = "contrib", # Color by contributions
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )

Above shows first 7 account for 90% variance

Please note: PCA didnt really didnt help at all because of the number of components. We were hoping for a reduction down to 3 - 5 components. stick with scaled data

Clustering using kmeans

determine number of Clusters

datatocluster <- data_scaled

Silhouette method recommends 2

fviz_nbclust(datatocluster, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")

Elbow method seems to show elbow at 4 or 5

fviz_nbclust(datatocluster, kmeans, method = "wss")+
  labs(subtitle = "Elbow method")

Gap statistic recommends 1

#gap stat takes some time, uncomment to run it

set.seed(123)
#fviz_nbclust(datatocluster, kmeans, nstart = 25,  method = "gap_stat", nboot = 50) + labs(subtitle = "Gap statistic method")

#euclidean takes some time, uncomment to run it
#lets also try nbclust using hierarchical clustering
#recommends 4
#NbClust(data = datatocluster, diss = NULL, distance = "euclidean",min.nc = 2, max.nc = 15, method = "ward.D")

Optimal clusters is 4

k=4;

kmeans

fit <- kmeans(datatocluster, k, nstart=25, iter.max=200)
fit
## K-means clustering with 4 clusters of sizes 320, 382, 300, 597
## 
## Cluster means:
##   fixed.acidity volatile.acidity citric.acid residual.sugar   chlorides
## 1   -0.57414243       -0.6042749  0.02563199     -0.3027505 -0.83944657
## 2    1.29529413       -0.6887925  1.15928827      0.2488172  0.38235662
## 3   -0.03712447        0.3360568  0.07558678      0.5274614  0.43915001
## 4   -0.50241112        0.5957616 -0.79351154     -0.2619873 -0.01538079
##   total.sulfur.dioxide    density         pH  sulphates    alcohol
## 1           -0.2161077 -1.2115084  0.4736841  0.3034858  1.2406304
## 2           -0.4441063  0.8315168 -0.8798589  0.5862033  0.1084757
## 3            1.4643972  0.4837596 -0.3572738 -0.1550951 -0.7100370
## 4           -0.3358729 -0.1257699  0.4886254 -0.4598268 -0.3776019
## 
## Clustering vector:
##    [1] 4 3 4 2 4 4 4 4 4 3 4 3 4 4 3 3 3 3 4 2 3 3 2 3 4 4 4 2 4 4 4 4 3 3
##   [35] 4 4 4 4 4 3 3 3 2 4 4 1 3 2 4 3 4 4 4 3 3 3 2 3 4 4 3 3 4 4 4 4 4 4
##   [69] 2 4 4 3 3 4 2 2 2 4 4 3 4 2 3 3 1 4 3 4 3 4 3 3 3 4 4 1 4 4 4 4 4 4
##  [103] 4 4 4 4 2 4 3 3 3 3 3 2 3 2 4 4 4 4 3 4 4 4 3 3 4 4 1 4 3 1 1 4 4 3
##  [137] 4 4 3 3 3 4 1 4 1 3 4 3 4 4 1 2 4 4 3 3 3 3 4 4 4 4 4 3 3 3 3 4 4 3
##  [171] 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 3 3 4 3 3 3 4 3 4 4 3 4 2 1 4 2 3 4 4
##  [205] 4 2 2 3 3 2 1 4 2 4 4 3 4 4 4 3 3 3 4 4 4 4 3 4 4 4 1 4 3 4 4 4 4 4
##  [239] 4 4 2 2 3 2 2 4 4 3 4 4 2 4 2 3 4 3 2 4 2 2 4 4 4 4 2 2 4 1 4 2 3 2
##  [273] 2 4 3 3 4 2 2 2 2 2 4 2 3 3 2 4 4 2 4 2 2 4 2 2 3 4 4 4 4 2 4 4 3 2
##  [307] 4 2 2 4 2 3 3 3 1 1 3 3 2 3 2 3 4 3 3 3 2 2 2 2 2 2 3 4 4 2 1 3 2 2
##  [341] 2 2 2 2 2 4 1 2 2 4 2 4 4 2 1 1 2 2 2 2 3 2 2 2 2 2 2 2 2 2 4 2 2 3
##  [375] 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 2 1 2 2 3 2 2 3 2 2 4 3 1 2 2 4 1 2 2
##  [409] 2 2 3 3 3 2 3 3 2 3 2 4 2 4 4 2 4 4 1 4 4 2 2 3 2 2 2 2 4 2 2 4 2 2
##  [443] 2 2 1 4 2 2 4 2 2 2 4 2 1 2 4 3 2 2 2 4 2 3 2 2 2 1 2 4 2 2 2 2 2 2
##  [477] 2 2 2 4 2 2 2 2 2 4 4 4 2 2 3 1 1 3 1 2 4 3 2 3 4 2 2 2 2 2 2 2 3 2
##  [511] 2 3 2 2 2 3 2 2 2 3 2 4 3 3 3 3 3 1 3 3 2 2 2 1 2 2 3 4 2 2 3 2 4 2
##  [545] 2 3 4 2 2 2 4 2 2 1 2 2 2 2 2 2 2 3 3 3 2 2 4 4 2 1 2 1 2 2 2 2 4 3
##  [579] 3 2 2 2 2 2 2 4 2 3 1 2 3 1 3 2 4 3 2 2 4 2 4 2 4 2 3 4 2 3 3 1 2 2
##  [613] 4 1 3 3 3 2 2 2 3 3 4 1 3 3 4 4 4 3 4 2 4 3 3 4 3 3 4 2 2 3 2 3 2 4
##  [647] 4 4 1 3 2 3 2 2 2 3 2 2 4 4 4 4 4 2 2 3 2 2 2 2 4 4 3 4 2 2 2 4 3 2
##  [681] 2 4 3 4 3 4 4 3 4 2 4 3 2 3 3 1 4 4 3 2 3 4 4 3 4 3 4 4 1 2 3 3 4 4
##  [715] 3 4 4 4 4 4 4 3 4 3 4 4 4 4 4 4 2 4 4 3 4 4 4 4 3 4 4 3 4 2 3 4 3 3
##  [749] 4 4 4 4 3 4 2 4 4 4 4 3 3 4 4 4 4 3 3 3 4 4 4 3 3 2 2 4 4 4 4 4 4 4
##  [783] 3 4 4 2 2 3 3 3 3 3 4 4 1 2 3 1 2 2 3 4 1 4 4 1 1 1 4 4 4 2 2 1 2 2
##  [817] 2 2 4 3 4 1 4 4 4 4 1 4 1 1 4 1 2 2 4 4 1 1 2 4 2 4 2 3 2 4 4 4 4 4
##  [851] 2 2 3 1 1 4 1 1 2 1 4 1 4 4 4 4 1 1 1 4 1 4 4 1 2 2 4 1 3 4 4 4 2 4
##  [885] 3 4 4 2 4 3 1 4 2 4 4 4 1 4 1 4 1 4 4 4 4 3 3 4 4 1 2 2 1 2 1 1 4 3
##  [919] 1 1 2 1 1 3 1 1 2 3 1 1 4 4 4 4 4 1 1 2 1 1 1 2 2 2 1 2 2 1 1 1 1 1
##  [953] 1 1 1 2 2 2 4 4 1 4 4 2 1 1 2 3 1 4 2 2 2 1 2 4 4 3 1 2 4 4 1 4 2 4
##  [987] 1 4 4 2 4 4 4 4 3 4 1 1 4 1 1 2 1 1 4 1 1 1 2 2 1 2 4 4 4 2 1 1 1 4
## [1021] 2 2 4 1 4 4 1 4 4 4 4 4 4 4 4 2 1 4 1 1 4 4 1 2 1 1 4 4 2 2 4 2 1 1
## [1055] 3 3 2 3 2 2 2 1 1 2 1 4 1 2 2 3 1 3 3 4 3 2 2 2 2 1 1 1 4 1 4 4 1 1
## [1089] 2 2 2 1 1 1 4 2 4 4 1 4 1 1 1 1 1 1 1 1 4 2 4 1 1 2 1 4 4 4 1 1 1 1
## [1123] 1 2 4 1 1 4 3 2 4 1 1 4 1 1 2 2 3 3 3 1 1 1 1 1 4 2 1 1 1 1 4 2 4 4
## [1157] 1 1 1 2 2 2 1 4 4 2 2 1 1 1 2 4 1 4 4 4 4 1 1 1 1 1 2 4 3 1 4 1 3 4
## [1191] 2 4 1 4 4 4 3 4 1 3 4 1 1 3 1 1 1 2 1 1 4 4 4 2 2 1 3 1 1 1 2 2 3 2
## [1225] 2 3 3 4 1 4 1 4 4 2 1 3 4 1 4 1 4 2 1 3 3 4 4 4 1 4 4 4 4 4 4 4 4 3
## [1259] 4 4 2 4 3 4 1 4 4 2 4 1 1 1 1 4 4 3 1 4 3 1 4 4 4 3 1 2 1 1 3 3 4 4
## [1293] 1 4 4 3 3 1 1 4 1 4 1 1 3 3 3 4 3 4 3 1 4 4 4 3 1 2 3 2 3 1 1 1 1 1
## [1327] 1 1 4 3 3 3 4 4 4 1 4 4 4 4 4 4 4 4 2 4 1 4 4 4 3 1 4 4 4 4 4 1 3 2
## [1361] 2 4 2 4 4 4 4 3 3 4 3 2 3 3 4 3 4 1 4 4 4 4 3 3 3 3 4 4 3 3 1 4 4 4
## [1395] 3 4 4 3 4 4 3 3 1 1 4 1 2 1 1 1 4 1 2 3 2 4 2 1 4 3 4 4 1 4 2 2 1 1
## [1429] 4 1 1 3 1 1 3 3 3 4 4 4 1 3 4 1 4 3 4 4 4 1 1 2 1 3 2 4 1 3 1 1 4 4
## [1463] 4 4 4 4 4 4 4 4 4 1 1 1 3 1 3 1 4 2 4 2 4 1 4 4 4 4 1 1 1 1 1 3 1 1
## [1497] 3 4 4 4 4 3 4 1 1 4 4 1 2 1 1 4 4 1 3 3 1 1 4 4 1 4 1 4 1 4 4 4 2 4
## [1531] 1 4 4 3 1 4 4 4 1 1 1 1 4 2 1 4 4 1 2 1 4 4 4 4 4 1 4 4 3 3 3 3 4 4
## [1565] 4 1 1 4 4 1 1 1 3 1 3 1 1 4 4 1 1 1 1 3 1 1 1 1 1 3 1 4 1 4 4 1 1 4
## [1599] 1
## 
## Within cluster sum of squares by cluster:
## [1] 2097.204 2876.922 2025.222 2940.944
##  (between_SS / total_SS =  37.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Cluster sizes
sort(table(fit$cluster))
## 
##   3   1   2   4 
## 300 320 382 597

visualize and analyze clusters generated by kmeans

datatovisualize1 <- data_scaled

#clusplot below, but is not useful as we performed analysis in 9 dimensions
#clusplot below uses first 2 dimensions which covers 47% variability
#too much overlap in 2d
#should really plot pairs or each variable against cluster
clusplot(datatovisualize1, fit$cluster,cex=1,xlab=colnames(datatovisualize1)[1],ylab=colnames(datatovisualize1)[2],col.p=fit$cluster,lines=0,labels=1)

#pairwise plot
#pairwise keeps running and crashes
#uncomment if need to run
#pairs(datatovisualize1, col=c("red","blue","green","yellow")[fit$cluster])

#put quality back in data
datatovisualize1$quality <- raw$quality

old.par <- par(mar = c(2, 2, 2, 2))
par(mfrow=c(2,2))


for(i in 1:10){
  boxplot(datatovisualize1[,i] ~ fit$cluster,
        xlab='Cluster Number', ylab=colnames(datatovisualize1)[i],
        main=paste('Clusters of ', as.character(colnames(datatovisualize1)[i])))
}

#reset graphics
par(old.par)

#cluster 3 is higher quality than others
#why is that?
# cluster 3: alcohol, pH are higher
# cluster 3: sugar, chlorides, density are lower

#put cluster back into raw data so we can save raw data and show in shiny app
raw$cluster_kmeans=fit$cluster

Clustering using Hierarchical Clustering

We implement a Ward’s hierarchical clustering procedure:

#distance matrix
d <- dist(data_scaled, method = "euclidean") 
#clustering
h_clust <- hclust(d, method = "ward.D2") 

Display dendrogram with cut rects

plot(h_clust)
rect.hclust(h_clust , k = 5, border = 2:8)
abline(h = 5, col = 'red')

Display clusters in different colors

avg_dend_obj <- as.dendrogram(h_clust)
avg_col_dend <- color_branches(avg_dend_obj, h =  40)
plot(avg_col_dend)

Extract clusters

groups <- cutree(h_clust,k=5)

Count number of instances in each group

data_df_cl <- mutate(data, cluster = groups)
count(data_df_cl, cluster)
## # A tibble: 5 x 2
##   cluster     n
##     <int> <int>
## 1       1   526
## 2       2   361
## 3       3   382
## 4       4   153
## 5       5   177

We can analysis the quality and

summary(data_df_cl)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar 
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   :0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.:1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median :2.200  
##  Mean   : 8.32   Mean   :0.5243   Mean   :0.271   Mean   :2.463  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.:2.600  
##  Max.   :15.90   Max.   :1.0100   Max.   :1.000   Max.   :5.100  
##    chlorides       total.sulfur.dioxide    density             pH      
##  Min.   :0.04100   Min.   :  6.00       Min.   :0.9923   Min.   :2.93  
##  1st Qu.:0.07000   1st Qu.: 22.00       1st Qu.:0.9956   1st Qu.:3.21  
##  Median :0.07900   Median : 38.00       Median :0.9968   Median :3.31  
##  Mean   :0.08167   Mean   : 45.37       Mean   :0.9967   Mean   :3.31  
##  3rd Qu.:0.09000   3rd Qu.: 62.00       3rd Qu.:0.9978   3rd Qu.:3.40  
##  Max.   :0.12610   Max.   :122.00       Max.   :1.0010   Max.   :3.68  
##    sulphates         alcohol         cluster     
##  Min.   :0.3300   Min.   : 8.40   Min.   :1.000  
##  1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:1.000  
##  Median :0.6200   Median :10.20   Median :2.000  
##  Mean   :0.6472   Mean   :10.41   Mean   :2.433  
##  3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:3.000  
##  Max.   :0.9900   Max.   :13.50   Max.   :5.000
ggplot(data_df_cl, aes(x=density, y = residual.sugar , color = factor(cluster))) + geom_point()

clusplot(data_df_cl, fit$cluster,cex=1,xlab=colnames(data_df_cl)[1],ylab=colnames(data_df_cl)[2],col.p=groups,lines=0,labels=1)

visualize and analyze clusters generated by hierarchical clustering

datatovisualize2 <- data_scaled

2D representation of the Segmentation:

clusplot(datatovisualize2, groups, main='Clusters')

# pairwise plot
# pairs(datatovisualize2, col=c("red","blue","green","yellow")[groups])
#put quality back in data
datatovisualize2$quality <- raw$quality
par(old.par)
par(mfrow=c(2,2))

for(i in 1:10){
  boxplot(datatovisualize2[,i] ~ groups,
        xlab='Cluster Number', ylab=colnames(datatovisualize2)[i],
        main=paste('Clusters of ', as.character(colnames(datatovisualize2)[i])))
}

#reset graphics
par(old.par)

#cluster 4 is higher quality than others
#why is that?
# cluster 4: alcohol is higher
# cluster 4: chlorides, density are lower

#put cluster back into raw data so we can save raw data and show in shiny app
raw$cluster_hclust=groups
#TODO save data to file to load in shiny


save(raw, file = "./WineAnalysis/shiny.RData")

Shiny App

Link: [https://yumingcui.shinyapps.io/WineAnalysis/]